# Recode the raw effect-quantile values (effect_20/effect_50/effect_80) in
# column `invar` into human-readable legend labels stored in column `outvar`.
make_nice_effect_quantiles <- function(df, invar, outvar) {
  quantile_labels <- c(
    effect_20 = "Effect at 20% quantile",
    effect_50 = "Effect at 50% quantile",
    effect_80 = "Effect at 80% quantile"
  )
  df %>%
    mutate({{ outvar }} := recode({{ invar }}, !!!quantile_labels))
}

# Shared manual colour scale for the three effect-quantile legend labels:
# lighter shades of the base blue "#007FA8" for the lower quantiles, the
# full colour for the 80% quantile. Stored as a list so it can be added to
# several plots with `+`.
effect_quantile_colours <- c(
  "Effect at 20% quantile" = colorspace::lighten("#007FA8", .7),
  "Effect at 50% quantile" = colorspace::lighten("#007FA8", .4),
  "Effect at 80% quantile" = "#007FA8"
)
effect_scale <- list(scale_color_manual(values = effect_quantile_colours))
# Load the fitted brms mixture model from disk; file_refit = "never" reuses
# the cached fit and never re-samples.
hm <- brm(file = "final_models/mix_final_vienna.rds", file_refit = "never")

Posterior predictive check

# Posterior predictive check with 10 posterior draws; the x-axis is truncated
# to [0, 8000] so the bulk of the APC distribution stays visible.
pp_check(hm, ndraws = 10) +
  coord_cartesian(xlim = c(0, 8000))

# Plot posterior predictive intervals against observed APCs for one country.
#
# Arguments:
#   df                 data with columns country, field, P_top10, APC_in_dollar
#   model              fitted brms model accepted by add_predicted_draws()
#   country_selection  country to visualise (single string)
#   var_for_offset     scaled variable whose "scaled:center" attribute is used
#                      to back-transform P_top10 to the original scale
#                      (NOTE(review): defaults to the global `base` -- confirm)
#   alpha              transparency of the jittered observed points
#   ndraws             number of posterior draws for the predictive intervals
pred_vis <- function(df, model, country_selection,
                     var_for_offset = base$P_top10, alpha = 1, ndraws = 1000) {
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  # Undo the log + centring transformation applied to P_top10.
  get_back <- function(df) mutate(df, P_top10 = exp(scale_offset + P_top10))

  # Filter once and reuse for both the prediction grid and the raw points
  # (the original filtered the same data twice).
  df_country <- filter(df, country == country_selection)

  df_country %>%
    modelr::data_grid(P_top10, country, field) %>%
    add_predicted_draws(model, ndraws = ndraws, re_formula = NULL) %>%
    get_back() %>% 
    ggplot(aes(P_top10, .prediction)) +
    stat_interval() +
    scale_color_manual(values = colorspace::lighten(clrs[4], c(.8, .67, .42))) +
    scale_y_continuous(labels = dollar) +
    geom_jitter(aes(y = APC_in_dollar), alpha = alpha, 
                position = position_jitter(width = 5, height = 50),
                data = get_back(df_country)) +
    facet_wrap(vars(field)) +
    labs(y = "Predicted vs. actual APC", x = expression(P["top 10%"]),
         color = "Credible interval") +
    theme_clean() +
    theme(legend.position = "bottom", panel.grid.minor = element_blank())
}
# Predictive checks per country; alpha is tuned to each country's number of
# observations (more points -> lower alpha).
pred_vis(base, hm, "Austria", alpha = .5)

pred_vis(base, hm, "Brazil", alpha = .2)

This updated model fares much better for Brazil. The predictions are still not ideal (underestimating higher APCs of ~2000), but overall much better than the previous model.

# Further per-country predictive checks.
pred_vis(base, hm, "China", alpha = .15)

pred_vis(base, hm, "United States", alpha = .2)

pred_vis(base, hm, "Turkey", alpha = .7)

Model variances and covariances

# Full model summary: group-level (country, field) SDs and correlations,
# population-level effects, and family-specific parameters of the two-component
# hurdle-lognormal mixture. All Rhat values are at or near 1.00.
summary(hm) 
##  Family: mixture(hurdle_lognormal, hurdle_lognormal) 
##   Links: mu1 = identity; sigma1 = identity; hu1 = logit; mu2 = identity; sigma2 = identity; hu2 = logit; theta1 = identity; theta2 = identity 
## Formula: APC_in_dollar | weights(total_weight) ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country) 
##          hu1 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          hu2 ~ 1 + P_top10 + (1 + P_top10 | field) + (1 + P_top10 | country)
##          theta1 ~ 1 + (1 | field)
##    Data: base (Number of observations: 116912) 
##   Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
##          total post-warmup draws = 4000
## 
## Group-Level Effects: 
## ~country (Number of levels: 69) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.47      0.06     0.37     0.59 1.00
## sd(mu1_P_top10)                    0.05      0.04     0.00     0.16 1.00
## sd(mu2_Intercept)                  0.03      0.01     0.02     0.04 1.01
## sd(mu2_P_top10)                    0.01      0.00     0.00     0.02 1.00
## sd(hu1_Intercept)                  1.04      0.18     0.73     1.44 1.00
## sd(hu1_P_top10)                    0.17      0.11     0.01     0.41 1.00
## sd(hu2_Intercept)                  1.93      0.22     1.54     2.40 1.00
## sd(hu2_P_top10)                    0.26      0.13     0.02     0.53 1.00
## cor(mu1_Intercept,mu1_P_top10)    -0.19      0.34    -0.81     0.51 1.00
## cor(mu2_Intercept,mu2_P_top10)    -0.25      0.37    -0.82     0.57 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.32      0.38    -0.87     0.57 1.00
## cor(hu2_Intercept,hu2_P_top10)    -0.21      0.31    -0.77     0.43 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                   931     2059
## sd(mu1_P_top10)                     622      979
## sd(mu2_Intercept)                  1681     2513
## sd(mu2_P_top10)                    1347     1702
## sd(hu1_Intercept)                  1844     2535
## sd(hu1_P_top10)                    1186     1658
## sd(hu2_Intercept)                  1495     2460
## sd(hu2_P_top10)                     573      960
## cor(mu1_Intercept,mu1_P_top10)     3926     2577
## cor(mu2_Intercept,mu2_P_top10)     3996     2803
## cor(hu1_Intercept,hu1_P_top10)     4009     2740
## cor(hu2_Intercept,hu2_P_top10)     3047     2402
## 
## ~field (Number of levels: 19) 
##                                Estimate Est.Error l-95% CI u-95% CI Rhat
## sd(mu1_Intercept)                  0.28      0.06     0.19     0.42 1.00
## sd(mu1_P_top10)                    0.07      0.02     0.04     0.13 1.00
## sd(mu2_Intercept)                  0.44      0.08     0.31     0.63 1.00
## sd(mu2_P_top10)                    0.02      0.01     0.01     0.03 1.00
## sd(hu1_Intercept)                  1.06      0.21     0.71     1.54 1.01
## sd(hu1_P_top10)                    0.15      0.07     0.02     0.30 1.00
## sd(hu2_Intercept)                  2.30      0.33     1.72     3.02 1.00
## sd(hu2_P_top10)                    0.46      0.11     0.28     0.71 1.00
## sd(theta1_Intercept)               0.71      0.20     0.41     1.18 1.00
## cor(mu1_Intercept,mu1_P_top10)     0.18      0.30    -0.43     0.72 1.00
## cor(mu2_Intercept,mu2_P_top10)    -0.21      0.42    -0.85     0.66 1.00
## cor(hu1_Intercept,hu1_P_top10)    -0.56      0.29    -0.94     0.17 1.00
## cor(hu2_Intercept,hu2_P_top10)     0.31      0.24    -0.21     0.72 1.00
##                                Bulk_ESS Tail_ESS
## sd(mu1_Intercept)                  1253     1855
## sd(mu1_P_top10)                    2052     3224
## sd(mu2_Intercept)                  1147     2067
## sd(mu2_P_top10)                    1620     1944
## sd(hu1_Intercept)                  1367     2317
## sd(hu1_P_top10)                    1480     1124
## sd(hu2_Intercept)                  1854     2309
## sd(hu2_P_top10)                    2116     2724
## sd(theta1_Intercept)               1146     2105
## cor(mu1_Intercept,mu1_P_top10)     3688     2998
## cor(mu2_Intercept,mu2_P_top10)     4088     2820
## cor(hu1_Intercept,hu1_P_top10)     3661     2506
## cor(hu2_Intercept,hu2_P_top10)     2797     2487
## 
## Population-Level Effects: 
##                  Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## mu1_Intercept        6.75      0.10     6.55     6.95 1.00     1051     1749
## hu1_Intercept       -0.62      0.34    -1.31     0.07 1.01     1084     1595
## mu2_Intercept        7.38      0.10     7.17     7.59 1.00      552      962
## hu2_Intercept        0.01      0.57    -1.14     1.09 1.01      795     1218
## theta1_Intercept    -0.69      0.18    -1.03    -0.32 1.00     1070     1683
## mu1_P_top10          0.17      0.03     0.11     0.24 1.00     2146     1816
## hu1_P_top10          0.02      0.09    -0.14     0.22 1.00     2426     2541
## mu2_P_top10          0.01      0.01    -0.00     0.03 1.00     2498     2451
## hu2_P_top10         -0.11      0.15    -0.41     0.18 1.00     1982     2623
## 
## Family Specific Parameters: 
##        Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma1     0.87      0.01     0.85     0.89 1.00     8372     3006
## sigma2     0.21      0.00     0.20     0.21 1.00     8462     2979
## 
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

“Marginal effects”

Here we compute marginal effects manually, by making predictions for a given x (P_top10) and for the same x multiplied by a small step (e.g., x * 1.01, i.e., increasing x on the original scale by 1%), and then comparing the two predictions. (Note: the field-level example below uses a 10% step, while the averaged effects use a 1% step.)

Fields

# Back out the centring offset that was used when P_top10 was scaled.
scale_offset <- attributes(base$P_top10)[["scaled:center"]]
# Baseline value on the original scale, plus a stepped value.
# NOTE(review): the step here is 10% (* 1.1), while the narrative above and
# average_draws() below use a 1% step (* 1.01) -- confirm which is intended.
x1_identity <- 1500
x2_identity <- x1_identity * 1.1
# Translate both values to the log-centred scale the model was fit on.
x1_log <- log(x1_identity) - scale_offset
x2_log <- log(x2_identity) - scale_offset


# Posterior predictions at both x values across all fields, Brazil only.
contrast <- predictions(
  hm,
  newdata = datagrid(P_top10 = c(x1_log, x2_log),
                     country = "Brazil", field = unique(base$field))) %>% 
  posteriordraws()
  
# Reshape so each draw carries a "base" and a "step" prediction side by side,
# then compute the relative change between them.
contrast_recomputed <- contrast %>% 
  mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
  pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
              names_from = x, values_from = draw) %>% 
  mutate(contrast = step / base - 1)
contrast_recomputed %>% 
  ggplot(aes(contrast, fct_reorder(field, contrast))) +
  stat_halfeye() +
  scale_x_continuous(labels = percent)

This is very sensitive to the respective country. Maybe we can recompute an average marginal effect after all?

# Posterior draws of the relative change in predicted APC when P_top10 (on
# its original scale) is increased by a small multiplicative step, evaluated
# at quantile `q` of `orig_var` over the full country x field grid.
#
# Arguments:
#   model           fitted brms model accepted by marginaleffects::predictions()
#   orig_var        P_top10 on the original (unscaled) scale; its q-quantile
#                   is the baseline x value
#   q               quantile of orig_var at which to evaluate the effect
#   var_for_offset  scaled variable carrying the "scaled:center" attribute
#                   (NOTE(review): defaults to the global `base` -- confirm)
#   step_factor     multiplicative step on the original scale; the default
#                   1.01 reproduces the previous hard-coded +1% step
#
# Returns a data frame of posterior draws with a `contrast` column holding
# step-prediction / base-prediction - 1.
average_draws <- function(model, orig_var, q = .5, 
                          var_for_offset = base$P_top10, step_factor = 1.01) {
  scale_offset <- attributes(var_for_offset)[["scaled:center"]]
  
  x1_identity <- quantile(orig_var, q)
  x2_identity <- x1_identity * step_factor
  # Convert both values to the log-centred scale used in the model.
  x1_log <- log(x1_identity) - scale_offset
  x2_log <- log(x2_identity) - scale_offset
  
  # Predict at both x values for every country/field combination.
  contrast_all <- predictions(
    model,
    newdata = datagrid(P_top10 = c(x1_log, x2_log),
                       country = unique(base$country), 
                       field = unique(base$field))) %>% 
    posteriordraws()
    
  # One row per draw with base and step predictions side by side, plus the
  # relative change between them.
  contrast_all %>% 
    mutate(x = factor(P_top10, labels = c("base", "step"))) %>% 
    pivot_wider(-c(predicted, conf.low, conf.high, P_top10, rowid), 
                names_from = x, values_from = draw) %>% 
    mutate(contrast = step / base - 1)
}

# Average the per-draw contrasts within each level of `var` (e.g. field or
# country), keeping one mean effect per posterior draw.
summarise_by <- function(contrast_df, var = field) {
  contrast_df %>% 
    group_by({{ var }}, drawid) %>% 
    # .groups = "drop_last" matches dplyr's default (result stays grouped by
    # {{ var }}) but makes it explicit and silences the grouping message.
    summarise(effect = mean(contrast), .groups = "drop_last")
}
# Half-eye plot of averaged per-field effects with 50%/90% highest-density
# intervals; `location` names the P_top10 quantile in the x-axis label.
plot_effect <- function(contrast_df, location = "the median") {
  x_label <- glue::glue(
    "% change of APC for 1% change of P_top10% at {location}"
  )
  contrast_df %>% 
    ggplot(aes(effect, fct_reorder(field, effect))) +
    stat_halfeye(.width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL,
      x = x_label,
      caption = "Averaged predictions over all countries."
    ) +
    theme_clean() +
    coord_cartesian(xlim = c(-0.005, 0.005))
}
# Effects at the 20%/50%/80% quantiles of P_top10.
# NOTE(review): uses `df$P_top10` while other chunks use `base$P_top10` --
# confirm `df` and `base` refer to the same data here.
contrast_20 <- average_draws(hm, df$P_top10, q = .2)
contrast_50 <- average_draws(hm, df$P_top10, q = .5)
contrast_80 <- average_draws(hm, df$P_top10, q = .8)
# Average the contrasts per field within each draw, then plot.
contrast_20_field <- summarise_by(contrast_20, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_20_field %>% 
  plot_effect("the 20% quantile")

This seems reasonable, but would need to further validate.

At 50%

# Field-level effects at the median (50% quantile) of P_top10.
contrast_50_field <- summarise_by(contrast_50, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
contrast_50_field %>% 
  plot_effect()

# Field-level effects at the 80% quantile of P_top10.
contrast_80_field <- summarise_by(contrast_80, field)
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
# Pass the quantile explicitly: plot_effect() defaults to "the median" in the
# axis label, which would be wrong for the 80%-quantile contrasts shown here.
contrast_80_field %>% 
  plot_effect("the 80% quantile")

Compare all three

# Stack the three quantile-level effect sets; each uses a distinct effect_*
# column name, so every row carries NA for the other two quantiles (dropped
# after pivoting below).
all_joined <- bind_rows(
  rename(contrast_50_field, effect_50 = effect),
  rename(contrast_80_field, effect_80 = effect),
  rename(contrast_20_field, effect_20 = effect)
)
# Long format with one row per field x draw x quantile; recode the quantile
# names into readable legend labels, then dodge the three point intervals
# within each field.
p <- all_joined %>% 
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na() %>% 
  make_nice_effect_quantiles(name, name) %>% 
  ggplot(aes(effect, fct_reorder(field, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9)) +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    x = expression(paste("% change of APC for 1% change of ", P["top 10%"], 
                         " at given quantiles")),
    caption = "Predictions averaged over all countries.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.005, 0.005)) +
  guides(colour = guide_legend(reverse = FALSE))
# Apply the shared effect-quantile colour scale and move the legend up top.
p + 
  effect_scale +
  theme(legend.position = "top") + 
  labs(colour = NULL)

# Alternative palette based on the sequential "Mako" scale, slightly lightened.
custom_pal <- sequential_hcl(5, palette = "Mako") %>% 
  lighten(amount = .1)
# The colour variable in `p` was recoded by make_nice_effect_quantiles(), so
# the manual values must be keyed by the recoded labels -- the original
# effect_* keys would not match and every point would fall back to the NA
# colour. Also use three distinct palette entries (effect_20 and effect_50
# previously both mapped to custom_pal[3]).
p + scale_color_manual(values = c(
    "Effect at 20% quantile" = custom_pal[2],
    "Effect at 50% quantile" = custom_pal[3],
    "Effect at 80% quantile" = custom_pal[4]
  ))

Questions that arise: (a) Why do some fields show a stronger or weaker effect at larger vs. smaller P_top10? Is this also associated with the hurdle component? (b) In particular, why are physics and mathematics negative — is this driven by the hurdle component?

  • Effect is diminishing for all fields with sufficient data.

Need to put into context of overall averages: give average per field (from model or full set)

Intercept at median

# Posterior distribution of the predicted APC at the median of P_top10 per
# field: the "base" prediction of the 50%-quantile contrast, averaged over
# countries within each draw.
contrast_50 %>% 
  group_by(field, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(field, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.

Countries

# Country-level averages of the contrasts at each quantile.
contrast_20_country <- summarise_by(contrast_20, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_50_country <- summarise_by(contrast_50, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
contrast_80_country <- summarise_by(contrast_80, country)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Stack the three country-level effect sets and pivot to long format. The
# trailing comma after the last bind_rows() argument was removed; it only
# worked because rlang's list2() tolerates empty trailing arguments.
all_countries <- bind_rows(
  rename(contrast_20_country, effect_20 = effect),
  rename(contrast_50_country, effect_50 = effect),
  rename(contrast_80_country, effect_80 = effect)
) %>% 
  pivot_longer(contains("effect"), values_to = "effect") %>% 
  drop_na()
# Dot-interval plot of per-country effects at the three quantiles.
p_country <- all_countries %>% 
  ggplot(aes(effect, fct_reorder(country, effect), colour = name)) +
  geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
  stat_pointinterval(position = position_dodge(width = .5),
                     .width = c(.5, .9), point_interval = "median_hdi") +
  scale_x_continuous(labels = percent) +
  labs(
    y = NULL, 
    # Plain string (nothing to interpolate, so glue() was unnecessary).
    x = "% Change of APC for 1% change of P_top10% at given quantiles",
    # Countries are on the y-axis, so the averaging is over fields; the
    # previous caption incorrectly said "over all countries".
    caption = "Predictions averaged over all fields.") +
  theme_clean() +
  coord_cartesian(xlim = c(-0.001, 0.005)) +
  guides(colour = guide_legend(reverse = TRUE))
# Colour the three quantile levels with lightened shades of the base blue;
# keys match the raw effect_* names since p_country does not recode them.
p_country + scale_color_manual(values = c(
    effect_20 = colorspace::lighten("#007FA8", .7),
    effect_50 = colorspace::lighten("#007FA8", .4),
    effect_80 = "#007FA8"
  ))

Group by continent

# One row per country with its region and ISO code, for joining onto the
# country-level effect estimates.
country_identifier <- base %>% 
  distinct(country, region, country_code) %>% 
  drop_na()

# Explicit join key (previously inferred, triggering the
# `Joining, by = "country"` message).
all_countries_w_region <- all_countries %>% 
  left_join(country_identifier, by = "country")
# Dot-interval plot of per-country effects at the three quantiles, faceted by
# world region, using the shared effect colour scale. Expects the long-format
# country effects joined with region information.
plot_countries_by_region <- function(df) {
  df %>% 
    make_nice_effect_quantiles(name, color_label) %>% 
    ggplot(aes(effect, fct_reorder(country, effect), colour = color_label)) +
    geom_vline(xintercept = 0, colour = "grey55", linetype = 2) +
    stat_pointinterval(position = position_dodge(width = .5),
                       .width = c(.5, .9), point_interval = "median_hdi") +
    scale_x_continuous(labels = percent) +
    labs(
      y = NULL, 
      x = expression(paste("% Change of APC for 1% change of ", P["top 10%"], 
                         " at given quantiles")),
      caption = "Predictions averaged over all fields.",
      colour = NULL) +
    theme_clean() +
    # str_wrap keeps long region names from widening the facet strips.
    facet_grid(rows = vars(str_wrap(region, 9)), space = "free_y", scales = "free_y") +
    coord_cartesian(xlim = c(-0.001, 0.005)) +
    guides(colour = guide_legend(reverse = FALSE, override.aes = list(size = 2))) +
    theme(legend.position = "top") +
    effect_scale
}

# Split countries into two panels so the y-axes stay readable: Europe plus
# Sub-Saharan Africa in one, all remaining regions in the other.
p_europe_subsahara <- all_countries_w_region %>% 
  filter(str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()

p_rest <- all_countries_w_region %>% 
  filter(!str_detect(region, "Europe|Sahara")) %>% 
  plot_countries_by_region()
p_europe_subsahara

p_rest

Relate to gdp

# World Bank development indicators for 2019 (includes GDP per capita).
gdp <- WDI::WDI(start = 2019, end = 2019)

# Join GDP data onto the country effects via the ISO2 code. The country.x /
# country.y suffixes appear because both tables have a `country` column.
country_effect_with_gdp <- all_countries_w_region %>% 
  left_join(gdp, by = c("country_code" = "iso2c"))
country_effect_with_gdp
## # A tibble: 828,000 x 9
##    country.x drawid name     effect region country_code country.y NY.GDP.PCAP.KD
##    <chr>     <fct>  <chr>     <dbl> <chr>  <chr>        <chr>              <dbl>
##  1 Algeria   1      effec~  1.65e-3 Middl~ DZ           Algeria            4115.
##  2 Algeria   2      effec~  3.66e-4 Middl~ DZ           Algeria            4115.
##  3 Algeria   3      effec~  9.70e-4 Middl~ DZ           Algeria            4115.
##  4 Algeria   4      effec~  1.16e-3 Middl~ DZ           Algeria            4115.
##  5 Algeria   5      effec~  2.40e-4 Middl~ DZ           Algeria            4115.
##  6 Algeria   6      effec~  3.13e-3 Middl~ DZ           Algeria            4115.
##  7 Algeria   7      effec~ -2.50e-4 Middl~ DZ           Algeria            4115.
##  8 Algeria   8      effec~  3.01e-3 Middl~ DZ           Algeria            4115.
##  9 Algeria   9      effec~  9.91e-4 Middl~ DZ           Algeria            4115.
## 10 Algeria   10     effec~  1.33e-3 Middl~ DZ           Algeria            4115.
## # ... with 827,990 more rows, and 1 more variable: year <int>
# Median effect at the 50% quantile vs GDP per capita: one point range per
# country (50% HDI), coloured by region.
p <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  ggplot(aes(gdp, effect, colour = region, label = country.x)) +
  geom_pointrange(aes(ymin = .lower, ymax = .upper)) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::comma) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(colour = NULL, x = "GDP per capita", 
       y = "% increase in APC for 1% increase of P_top10 at the median") +
  theme_clean() +
  theme(legend.position = "top")  +
  coord_cartesian(ylim = c(0, .003))
p
## Warning: Removed 1 rows containing missing values (geom_pointrange).

# Interactive version with hover labels (`label` aesthetic set above).
plotly::ggplotly(p)

This could still be improved by adding number of universities (or papers) as a size variable.

# Number of distinct universities per country, used below as an alpha cue.
# NOTE(review): uses `df` rather than `base` -- confirm both refer to the
# same data.
unis_p_country <- df %>% 
  distinct(country, University) %>% 
  count(country, name = "n_universities") 
# Median effect (50% HDI) at the 50% quantile per country, joined with GDP
# and university counts.
pdata <- country_effect_with_gdp %>% 
  filter(name == "effect_50") %>% 
  rename(gdp = NY.GDP.PCAP.KD) %>% 
  group_by(country_code, country.x, gdp, region) %>% 
  point_interval(effect, .width = .5, .point = median, .interval = hdi) %>% 
  left_join(unis_p_country, by = c("country.x" = "country"))

# Only a hand-picked subset of countries gets a text label; all others get "".
labels <- pdata %>% 
  mutate(label = case_when(
    country.x %in% c("China", "India", "Iran", "Germany", "United States",
                      "Brazil", "Luxembourg", "Czech Republic") ~ country.x,
    TRUE ~ ""))

# GDP scatter with transparency encoding the number of universities; the base
# aes uses a blank label, and geom_text_repel supplies labels for the
# hand-picked countries from `labels`.
pdata %>% 
  ggplot(aes(gdp, effect, colour = region, label = "")) +
  geom_linerange(aes(ymin = .lower, ymax = .upper, alpha = n_universities)) +
  geom_point(aes(alpha = n_universities), size = 2) +
  ggrepel::geom_text_repel(data = labels, aes(label = label),
                            show.legend = FALSE, max.overlaps = Inf,
                           box.padding = 1, min.segment.length = 0) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::comma) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  # scale_size_continuous(trans = "sqrt") +
  # Log-scaled alpha so countries with few universities stay visible.
  scale_alpha_continuous(range = c(.2, 1), trans = "log10") +
  labs(colour = "Region", x = "GDP per capita", alpha = "Number of universities",
       y = "% increase in APC for 1% increase of P_top10 at the median") +
  theme_clean() +
  theme(legend.position = "top", legend.box = "vertical")  +
  coord_cartesian(ylim = c(0, .003))
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

# Posterior distribution of the predicted APC at the median of P_top10 per
# country: the "base" prediction of the 50%-quantile contrast, averaged over
# fields within each draw.
contrast_50 %>% 
  group_by(country, drawid) %>% 
  summarise(intercept = mean(base)) %>% 
  ggplot(aes(intercept, fct_reorder(country, intercept))) +
  stat_halfeye(.width = c(.5, .9), fill = colorspace::lighten("#007FA8"),
               point_interval = "median_hdi") +
  scale_x_continuous(labels = scales::dollar) +
  theme_clean() +
  labs(y = NULL, x = expression(paste("Estimated APC at median of ",
                                      P["top 10%"])))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.

Not sure whether this is helpful. What we show is quite abstract (the APC at a hypothetical level of P_top10, averaged across all fields, weighting them equally).

The same would apply to the field intercepts above.